# Libraries for parsing data
import os
import pandas as pd
import xml.etree.ElementTree as ET
from lxml import etree
from bs4 import BeautifulSoup
import re
import numpy as np
# Base project directory (hard-coded, machine-specific); the pickled
# transcript table produced by this script is written beneath it.
path_dropbox = "D:/Dropbox/Research/China Foreign Share Discount"
# Locates the "Conference Call Participants" section of a transcript body.
# Group 2 is everything between the "=" rule under the heading and the next
# blank line followed by another "=" rule. Compiled once, reused per file.
_PARTICIPANT_SECTION = re.compile(
    r'(Conference Call Participants\n=+)(\n.*?)(\n\n=+)', flags=re.DOTALL
)


def _child_text(root, tag):
    """Return the text of the first direct child named *tag*, or None if absent."""
    node = root.find(tag)
    # Compare with None explicitly: an Element without children is falsy.
    return None if node is None else node.text


def callExtract(path, filename):
    """Parse one transcript XML file and return its metadata as a flat list.

    Args:
        path: Directory containing the file.
        filename: Name of the XML file inside ``path``.

    Returns:
        A 14-item list, in this order: filename, eventId, eventTypeId,
        eventTypeName, eventTitle, city, companyName, Ticker, Date,
        companyId, CUSIP, SEDOL, ISIN, participants.  The three event
        fields come from attributes of the root element; the next nine are
        the text of same-named child elements (``None`` when the element is
        missing or empty).  ``participants`` is a list of strings, one per
        ``" * "`` bullet in the "Conference Call Participants" section of
        the body text, or ``[]`` when the body is empty or has no such
        section.

    Raises:
        KeyError: If the root element lacks ``Id``, ``eventTypeId`` or
            ``eventTypeName``.
        IndexError: If the root has no first child, or that child has no
            second child (the body text is read from ``root[0][1]``).
        xml.etree.ElementTree.ParseError: If the file is not well-formed XML.
    """
    root = ET.parse(os.path.join(path, filename)).getroot()

    # from attributes (treated as mandatory: a missing one raises KeyError)
    attributes = root.attrib
    eventId = attributes['Id']
    eventTypeId = attributes['eventTypeId']
    eventTypeName = attributes['eventTypeName']

    # from children's text; a missing element yields None instead of
    # aborting the whole batch with an AttributeError
    eventTitle = _child_text(root, 'eventTitle')
    city = _child_text(root, 'city')
    companyName = _child_text(root, 'companyName')
    Ticker = _child_text(root, 'companyTicker')
    Date = _child_text(root, 'startDate')
    companyId = _child_text(root, 'companyId')
    CUSIP = _child_text(root, 'CUSIP')
    SEDOL = _child_text(root, 'SEDOL')
    ISIN = _child_text(root, 'ISIN')

    # participant list
    # NOTE(review): the body is addressed positionally (second child of the
    # root's first child); confirm this holds for every transcript vintage.
    text = root[0][1].text
    # An element with no text has .text == None, which re cannot search.
    section = _PARTICIPANT_SECTION.search(text) if text else None
    if section:
        # The chunk before the first bullet is just leading whitespace.
        participant = section.group(2).split('\n * ')[1:]
    else:
        participant = []

    return [
        filename, eventId, eventTypeId, eventTypeName, eventTitle, city,
        companyName, Ticker, Date, companyId, CUSIP, SEDOL, ISIN, participant,
    ]
# Build the transcript metadata table: one row per file found under
# E:/Transcripts/<year> for 2001-2023.  Column order matches the list
# returned by callExtract.
columns = [
    'filename', 'eventId', 'eventTypeId', 'eventTypeName', 'eventTitle',
    'city', 'companyName', 'Ticker', 'Date', 'companyId', 'CUSIP', 'SEDOL',
    'ISIN', 'participants',
]

# Accumulate plain lists and create the DataFrame once at the end; growing a
# DataFrame row by row with df.loc[len(df)] gets slower with every insert.
rows = []
for year in range(2001, 2024):
    path = f"E:/Transcripts/{year}"
    rows.extend(callExtract(path, filename) for filename in os.listdir(path))
    print(f'{year} is done!!!')

df = pd.DataFrame(rows, columns=columns)
# Output of the loop above:
# 2001 is done!!! 2002 is done!!! 2003 is done!!! 2004 is done!!! 2005 is done!!! 2006 is done!!!
# 2007 is done!!! 2008 is done!!! 2009 is done!!! 2010 is done!!! 2011 is done!!! 2012 is done!!!
# 2013 is done!!! 2014 is done!!! 2015 is done!!! 2016 is done!!! 2017 is done!!! 2018 is done!!!
# 2019 is done!!! 2020 is done!!! 2021 is done!!! 2022 is done!!! 2023 is done!!!
import pickle

# Persist the assembled transcript table as a pickle under the project folder.
transcript_pickle_path = f"{path_dropbox}/Conference Call Transcript/transcript.pkl"
with open(transcript_pickle_path, 'wb') as handle:
    pickle.dump(df, handle)